package org.apache.solr.util;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * A simple utility class for posting raw updates to a Solr server, 
 * has a main method so it can be run on the command line.
 * View this not as a best-practice code example, but as a standalone 
 * example built with an explicit purpose of not having external
 * jar dependencies.
 */
public class SimplePostTool {

    // Default target: the update handler of the local example Solr instance.
    private static final String DEFAULT_POST_URL = "http://localhost:8983/solr/update";
    private static final String VERSION_OF_THIS_TOOL = "1.5";
    // Defaults for the yes/no style system properties (see usage()).
    private static final String DEFAULT_COMMIT = "yes";
    private static final String DEFAULT_OPTIMIZE = "no";
    private static final String DEFAULT_OUT = "no";
    private static final String DEFAULT_AUTO = "no";
    private static final String DEFAULT_RECURSIVE = "0";
    // Politeness delay in seconds between fetches when crawling the web.
    private static final int DEFAULT_WEB_DELAY = 10;
    // Hard cap on crawl depth in web mode (enforced in doWebMode()).
    private static final int MAX_WEB_DEPTH = 10;
    private static final String DEFAULT_CONTENT_TYPE = "application/xml";
    // File endings accepted in auto mode unless overridden via -Dfiletypes.
    private static final String DEFAULT_FILE_TYPES = "xml,json,csv,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,odp,ods,ott,otp,ots,rtf,htm,html,txt,log";
    static final String DATA_MODE_FILES = "files";
    static final String DATA_MODE_ARGS = "args";
    static final String DATA_MODE_STDIN = "stdin";
    static final String DATA_MODE_WEB = "web";
    static final String DEFAULT_DATA_MODE = DATA_MODE_FILES;
    // Input args
    boolean auto = false;
    int recursive = 0;
    // Seconds to sleep between individual posts (see postFiles/webCrawl).
    int delay = 0;
    String fileTypes;
    URL solrUrl;
    // Where Solr responses are echoed; null means discard.
    OutputStream out = null;
    String type;
    String mode;
    boolean commit;
    boolean optimize;
    String[] args;
    // Current recursion depth while walking directories (postDirectory).
    private int currentDepth;
    // Suffix -> content-type table, populated in the static initializer.
    static HashMap<String, String> mimeMap;
    GlobFileFilter globFileFilter;
    // Backlog for crawling: one LinkedHashSet of URLs per crawl level.
    List<LinkedHashSet<URL>> backlog = new ArrayList<>();
    // URLs already fetched, so the crawler never revisits a page.
    Set<URL> visited = new HashSet<>();
    static final Set<String> DATA_MODES = new HashSet<>();
    static final String USAGE_STRING_SHORT = "Usage: java [SystemProperties] -jar post.jar [-h|-] [<file|folder|url|arg> [<file|folder|url|arg>...]]";
    // Used in tests to avoid doing actual network traffic
    static boolean mockMode = false;
    static PageFetcher pageFetcher;

    static {
        DATA_MODES.add(DATA_MODE_FILES);
        DATA_MODES.add(DATA_MODE_ARGS);
        DATA_MODES.add(DATA_MODE_STDIN);
        DATA_MODES.add(DATA_MODE_WEB);

        mimeMap = new HashMap<>();
        mimeMap.put("xml", "text/xml");
        mimeMap.put("csv", "text/csv");
        mimeMap.put("json", "application/json");
        mimeMap.put("pdf", "application/pdf");
        mimeMap.put("rtf", "text/rtf");
        mimeMap.put("html", "text/html");
        mimeMap.put("htm", "text/html");
        mimeMap.put("doc", "application/msword");
        mimeMap.put("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
        mimeMap.put("ppt", "application/vnd.ms-powerpoint");
        mimeMap.put("pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
        mimeMap.put("xls", "application/vnd.ms-excel");
        mimeMap.put("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
        mimeMap.put("odt", "application/vnd.oasis.opendocument.text");
        mimeMap.put("ott", "application/vnd.oasis.opendocument.text");
        mimeMap.put("odp", "application/vnd.oasis.opendocument.presentation");
        mimeMap.put("otp", "application/vnd.oasis.opendocument.presentation");
        mimeMap.put("ods", "application/vnd.oasis.opendocument.spreadsheet");
        mimeMap.put("ots", "application/vnd.oasis.opendocument.spreadsheet");
        mimeMap.put("txt", "text/plain");
        mimeMap.put("log", "text/plain");
    }

    /**
     * Command line entry point. Prints the full usage text when the first
     * argument is -h/-help/--help; otherwise parses system properties,
     * initializes the tool and runs the post job. See usage() for details.
     *
     * @param args the params on the command line
     */
    public static void main(String[] args) {

        info("SimplePostTool version " + VERSION_OF_THIS_TOOL);
        boolean helpRequested = args.length > 0
                && ("-help".equals(args[0]) || "--help".equals(args[0]) || "-h".equals(args[0]));
        if (helpRequested) {
            usage();
            return;
        }
        parseArgsAndInit(args).execute();
    }

    /**
     * After initialization, call execute to start the post job. Delegates to
     * the mode method matching the "data" property, then performs the
     * optional commit and optimize requests.
     */
    public void execute() {
        if (DATA_MODE_STDIN.equals(mode)) {
            // stdin mode is the only one that needs no command line arguments
            doStdinMode();
        }
        else if (args.length == 0) {
            // every other mode requires at least one argument
            usageShort();
            return;
        }
        else if (DATA_MODE_FILES.equals(mode)) {
            doFilesMode();
        }
        else if (DATA_MODE_ARGS.equals(mode)) {
            doArgsMode();
        }
        else if (DATA_MODE_WEB.equals(mode)) {
            doWebMode();
        }
        else {
            usageShort();
            return;
        }

        if (commit) {
            commit();
        }
        if (optimize) {
            optimize();
        }
    }

    /**
     * Parses incoming arguments and system params and initializes the tool.
     *
     * @param args the incoming cmd line args
     * @return an instance of SimplePostTool; on a bad "url" property fatal()
     * terminates the JVM, so the null return below is never actually reached
     */
    protected static SimplePostTool parseArgsAndInit(String[] args) {

        String urlStr = null;
        try {
            // Validate the requested data mode against the known set
            final String mode = System.getProperty("data", DEFAULT_DATA_MODE);
            if (!DATA_MODES.contains(mode)) {
                fatal("System Property 'data' is not valid for this tool: " + mode);
            }

            // Extra request params (already URL-encoded) are folded into the URL
            String params = System.getProperty("params", "");
            urlStr = System.getProperty("url", SimplePostTool.appendParam(DEFAULT_POST_URL, params));
            URL url = new URL(urlStr);
            boolean auto = isOn(System.getProperty("auto", DEFAULT_AUTO));
            String type = System.getProperty("type");

            // Recursive: either a numeric depth, or yes/on/true/1 meaning
            // depth 1 for web mode and effectively unlimited (999) otherwise
            int recursive = 0;
            String r = System.getProperty("recursive", DEFAULT_RECURSIVE);
            try {
                recursive = Integer.parseInt(r);
            }
            catch (Exception e) {
                if (isOn(r)) {
                    recursive = DATA_MODE_WEB.equals(mode) ? 1 : 999;
                }
            }

            // Delay between posts: 10s default in web mode, 0 otherwise.
            // A malformed value deliberately keeps the default (best effort).
            int delay = DATA_MODE_WEB.equals(mode) ? DEFAULT_WEB_DELAY : 0;
            try {
                delay = Integer.parseInt(System.getProperty("delay", "" + delay));
            }
            catch (Exception e) { }

            OutputStream out = isOn(System.getProperty("out", DEFAULT_OUT)) ? System.out : null;
            String fileTypes = System.getProperty("filetypes", DEFAULT_FILE_TYPES);
            boolean commit = isOn(System.getProperty("commit", DEFAULT_COMMIT));
            boolean optimize = isOn(System.getProperty("optimize", DEFAULT_OPTIMIZE));

            return new SimplePostTool(mode, url, auto, type, recursive, delay, fileTypes, out, commit, optimize, args);
        }
        catch (MalformedURLException e) {
            fatal("System Property 'url' is not a valid URL: " + urlStr);
            return null; // unreachable: fatal() calls System.exit(2)
        }
    }

    /**
     * Constructor which takes in all mandatory input for the tool to work.
     * Also see usage() for further explanation of the params.
     *
     * @param mode whether to post files, web pages, params or stdin
     * @param url the Solr base Url to post to, should end with /update
     * @param auto if true, we'll guess type and add resourcename/url
     * @param type content-type of the data you are posting
     * @param recursive number of levels for file/web mode, or 0 if one file only
     * @param delay if recursive then delay will be the wait time between posts
     * @param fileTypes a comma separated list of file-name endings to accept
     * for file/web
     * @param out an OutputStream to write output to, e.g. stdout to print to
     * console
     * @param commit if true, will commit at end of posting
     * @param optimize if true, will optimize at end of posting
     * @param args a String[] of arguments, varies between modes
     */
    public SimplePostTool(String mode, URL url, boolean auto, String type,
            int recursive, int delay, String fileTypes, OutputStream out,
            boolean commit, boolean optimize, String[] args) {

        // Target and mode
        this.mode = mode;
        this.solrUrl = url;
        this.args = args;
        // Content handling
        this.auto = auto;
        this.type = type;
        this.fileTypes = fileTypes;
        this.globFileFilter = getFileFilterFromFileTypes(fileTypes);
        // Recursion/crawl behavior
        this.recursive = recursive;
        this.delay = delay;
        // Output and post-processing flags
        this.out = out;
        this.commit = commit;
        this.optimize = optimize;
        pageFetcher = new PageFetcher();
    }

    /**
     * No-arg constructor; leaves all fields at their defaults.
     * NOTE(review): presumably exists for tests/subclasses — callers must set
     * the relevant fields before use; confirm against callers.
     */
    public SimplePostTool() {
    }

    //
    // Mode implementations: one method per supported value of the "data" property
    //

    /** Posts the files/directories/globs named in args to the Solr URL. */
    private void doFilesMode() {

        currentDepth = 0;
        if (args[0].equals("-")) {
            // Special argument "-" means: skip posting files (commit/optimize only)
            return;
        }
        String typeSuffix = !auto ? " using content-type " + (type == null ? DEFAULT_CONTENT_TYPE : type) : "";
        info("Posting files to base url " + solrUrl + typeSuffix + "..");
        if (auto) {
            info("Entering auto mode. File endings considered are " + fileTypes);
        }
        if (recursive > 0) {
            info("Entering recursive mode, max depth=" + recursive + ", delay=" + delay + "s");
        }
        int numFilesPosted = postFiles(args, 0, out, type);
        info(numFilesPosted + " files indexed.");
    }

    /** Posts each command line argument verbatim as an update request body. */
    private void doArgsMode() {

        info("POSTing args to " + solrUrl + "..");
        for (int i = 0; i < args.length; i++) {
            postData(stringToStream(args[i]), null, out, type, solrUrl);
        }
    }

    /**
     * Crawls and posts web pages starting from the URLs in args.
     * NOTE: mutates this instance — forces auto mode, rewrites solrUrl to the
     * /extract (SolrCell) handler and caps recursion depth at MAX_WEB_DEPTH.
     *
     * @return the number of web pages posted
     */
    private int doWebMode() {

        reset();
        int numPagesPosted = 0;
        try {
            if (type != null) {
                // fatal() exits the JVM, so no explicit return is needed here
                fatal("Specifying content-type with \"-Ddata=web\" is not supported");
            }
            if (args[0].equals("-")) {
                // Skip posting url if special param "-" given  
                return 0;
            }
            // Set Extracting handler as default
            solrUrl = appendUrlPath(solrUrl, "/extract");

            info("Posting web pages to Solr url " + solrUrl);
            auto = true;
            info("Entering auto mode. Indexing pages with content-types corresponding to file endings " + fileTypes);
            if (recursive > 0) {
                if (recursive > MAX_WEB_DEPTH) {
                    recursive = MAX_WEB_DEPTH;
                    warn("Too large recursion depth for web mode, limiting to " + MAX_WEB_DEPTH + "...");
                }
                if (delay < DEFAULT_WEB_DELAY) {
                    warn("Never crawl an external web site faster than every 10 seconds, your IP will probably be blocked");
                }
                info("Entering recursive mode, depth=" + recursive + ", delay=" + delay + "s");
            }
            numPagesPosted = postWebPages(args, 0, out);
            info(numPagesPosted + " web pages indexed.");
        }
        catch (MalformedURLException e) {
            fatal("Wrong URL trying to append /extract to " + solrUrl);
        }
        return numPagesPosted;
    }

    /** Reads raw update data from stdin and posts it to the Solr URL. */
    private void doStdinMode() {

        info("POSTing stdin to " + solrUrl + "..");
        postData(System.in, null, out, type, solrUrl);
    }

    /** Restores file-type filtering to defaults and clears all crawl state. */
    private void reset() {

        fileTypes = DEFAULT_FILE_TYPES;
        globFileFilter = getFileFilterFromFileTypes(fileTypes);
        backlog = new ArrayList<LinkedHashSet<URL>>();
        visited = new HashSet<URL>();
    }

    //
    // USAGE
    //
    /** Prints the one-line usage summary plus a pointer to -h for full help. */
    private static void usageShort() {
        System.out.println(USAGE_STRING_SHORT + "\n       Please invoke with -h option for extended usage help.");
    }

    /**
     * Prints the full usage help — all supported system properties, their
     * defaults and invocation examples — to stdout.
     */
    private static void usage() {

        System.out.println(USAGE_STRING_SHORT + "\n\n"
                + "Supported System Properties and their defaults:\n"
                + "  -Ddata=files|web|args|stdin (default=" + DEFAULT_DATA_MODE + ")\n"
                + "  -Dtype=<content-type> (default=" + DEFAULT_CONTENT_TYPE + ")\n"
                + "  -Durl=<solr-update-url> (default=" + DEFAULT_POST_URL + ")\n"
                + "  -Dauto=yes|no (default=" + DEFAULT_AUTO + ")\n"
                + "  -Drecursive=yes|no|<depth> (default=" + DEFAULT_RECURSIVE + ")\n"
                + "  -Ddelay=<seconds> (default=0 for files, 10 for web)\n"
                + "  -Dfiletypes=<type>[,<type>,...] (default=" + DEFAULT_FILE_TYPES + ")\n"
                + "  -Dparams=\"<key>=<value>[&<key>=<value>...]\" (values must be URL-encoded)\n"
                + "  -Dcommit=yes|no (default=" + DEFAULT_COMMIT + ")\n"
                + "  -Doptimize=yes|no (default=" + DEFAULT_OPTIMIZE + ")\n"
                + "  -Dout=yes|no (default=" + DEFAULT_OUT + ")\n\n"
                + "This is a simple command line tool for POSTing raw data to a Solr\n"
                + "port.  Data can be read from files specified as commandline args,\n"
                + "URLs specified as args, as raw commandline arg strings or via STDIN.\n"
                + "Examples:\n"
                + "  java -jar post.jar *.xml\n"
                + "  java -Ddata=args  -jar post.jar '<delete><id>42</id></delete>'\n"
                + "  java -Ddata=stdin -jar post.jar < hd.xml\n"
                + "  java -Ddata=web -jar post.jar http://example.com/\n"
                + "  java -Dtype=text/csv -jar post.jar *.csv\n"
                + "  java -Dtype=application/json -jar post.jar *.json\n"
                + "  java -Durl=http://localhost:8983/solr/update/extract -Dparams=literal.id=a -Dtype=application/pdf -jar post.jar a.pdf\n"
                + "  java -Dauto -jar post.jar *\n"
                + "  java -Dauto -Drecursive -jar post.jar afolder\n"
                + "  java -Dauto -Dfiletypes=ppt,html -jar post.jar afolder\n"
                + "The options controlled by System Properties include the Solr\n"
                + "URL to POST to, the Content-Type of the data, whether a commit\n"
                + "or optimize should be executed, and whether the response should\n"
                + "be written to STDOUT. If auto=yes the tool will try to set type\n"
                + "and url automatically from file name. When posting rich documents\n"
                + "the file name will be propagated as \"resource.name\" and also used\n"
                + "as \"literal.id\". You may override these or any other request parameter\n"
                + "through the -Dparams property. To do a commit only, use \"-\" as argument.\n"
                + "The web mode is a simple crawler following links within domain, default delay=10s.");
    }

    /**
     * Posts all file names provided in args. An argument that is neither an
     * existing readable file nor directory is treated as a glob and resolved
     * against its parent directory (or "." when no parent is given).
     *
     * @param args array of file names, directory names or globs
     * @param startIndexInArgs offset in {@code args} to start from
     * @param out output stream to write the Solr responses to (may be null)
     * @param type default content-type to use when posting (may be overridden
     * in auto mode)
     * @return number of files posted
     */
    public int postFiles(String[] args, int startIndexInArgs, OutputStream out, String type) {

        reset();
        int filesPosted = 0;
        for (int j = startIndexInArgs; j < args.length; j++) {
            File srcFile = new File(args[j]);
            if (srcFile.isDirectory() && srcFile.canRead()) {
                filesPosted += postDirectory(srcFile, out, type);
            }
            else if (srcFile.isFile() && srcFile.canRead()) {
                filesPosted += postFiles(new File[]{srcFile}, out, type);
            }
            else {
                // Not a plain file/directory: treat the last path component
                // as a glob against the parent directory.
                File parent = srcFile.getParentFile();
                if (parent == null) {
                    parent = new File(".");
                }
                String fileGlob = srcFile.getName();
                GlobFileFilter ff = new GlobFileFilter(fileGlob, false);
                File[] files = parent.listFiles(ff);
                if (files == null || files.length == 0) {
                    warn("No files or directories matching " + srcFile);
                    continue;
                }
                // Fix: reuse the listing obtained above instead of calling
                // parent.listFiles(ff) a second time — the re-list was
                // wasteful and could race with concurrent directory changes.
                filesPosted += postFiles(files, out, type);
            }
        }
        return filesPosted;
    }

    /**
     * Posts all files provided in files; directories are recursed into and
     * non-file entries are resolved as globs against their parent directory.
     *
     * @param files array of Files
     * @param startIndexInArgs offset to start.
     * NOTE(review): this parameter is currently ignored — every entry of
     * {@code files} is processed regardless; confirm whether the offset was
     * ever meant to apply here.
     * @param out output stream to post data to
     * @param type default content-type to use when posting (may be overridden
     * in auto mode)
     * @return number of files posted
     */
    public int postFiles(File[] files, int startIndexInArgs, OutputStream out, String type) {

        reset();
        int filesPosted = 0;
        for (File srcFile : files) {
            if (srcFile.isDirectory() && srcFile.canRead()) {
                filesPosted += postDirectory(srcFile, out, type);
            }
            else if (srcFile.isFile() && srcFile.canRead()) {
                filesPosted += postFiles(new File[]{srcFile}, out, type);
            }
            else {
                // Neither readable file nor directory: resolve as a glob
                File parent = srcFile.getParentFile();
                if (parent == null) {
                    parent = new File(".");
                }
                String fileGlob = srcFile.getName();
                GlobFileFilter ff = new GlobFileFilter(fileGlob, false);
                File[] fileList = parent.listFiles(ff);
                if (fileList == null || fileList.length == 0) {
                    warn("No files or directories matching " + srcFile);
                    continue;
                }
                filesPosted += postFiles(fileList, out, type);
            }
        }
        return filesPosted;
    }

    /**
     * Posts a whole directory, recursing into subdirectories while
     * {@code recursive} exceeds the current depth. Hidden directories
     * (other than ".") are skipped.
     *
     * @param dir the directory to post
     * @param out output stream to write the Solr responses to (may be null)
     * @param type default content-type to use when posting
     * @return number of files posted total
     */
    private int postDirectory(File dir, OutputStream out, String type) {

        if (dir.isHidden() && !dir.getName().equals(".")) {
            return 0;
        }
        // Fix: list the directory once (the original listed it twice — once
        // for the log line and once for posting) and guard against the null
        // that File.listFiles returns for unreadable directories, which
        // previously caused a NullPointerException.
        File[] matched = dir.listFiles(globFileFilter);
        if (matched == null) {
            warn("Unable to list directory " + dir.getPath());
            return 0;
        }
        info("Indexing directory " + dir.getPath() + " (" + matched.length + " files, depth=" + currentDepth + ")");

        int posted = postFiles(matched, out, type);
        if (recursive > currentDepth) {
            File[] children = dir.listFiles();
            if (children != null) {
                for (File d : children) {
                    if (d.isDirectory()) {
                        currentDepth++;
                        posted += postDirectory(d, out, type);
                        currentDepth--;
                    }
                }
            }
        }
        return posted;
    }

    /**
     * Posts a list of files, skipping non-files and hidden files, and pausing
     * {@code delay} seconds between each post.
     *
     * @param files the files to post
     * @param out output stream to write the Solr responses to (may be null)
     * @param type default content-type to use when posting
     * @return number of files posted
     */
    int postFiles(File[] files, OutputStream out, String type) {

        int filesPosted = 0;
        for (File srcFile : files) {
            if (!srcFile.isFile() || srcFile.isHidden()) {
                continue;
            }
            postFile(srcFile, out, type);
            try {
                // 1000L: multiply in long to avoid int overflow for huge delays
                Thread.sleep(delay * 1000L);
            }
            catch (InterruptedException e) {
                // Fix: restore the thread's interrupt status and preserve the
                // cause instead of throwing a bare RuntimeException.
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            }
            filesPosted++;
        }
        return filesPosted;
    }

    /**
     * Takes a list of start URL strings for crawling, adds each one to level 0
     * of the crawl backlog and then starts the recursive crawl.
     *
     * @param args the raw input args from main()
     * @param startIndexInArgs offset for where to start
     * @param out outputStream to write results to
     * @return the number of web pages posted
     */
    public int postWebPages(String[] args, int startIndexInArgs, OutputStream out) {

        reset();
        LinkedHashSet<URL> seedUrls = new LinkedHashSet<>();
        for (int i = startIndexInArgs; i < args.length; i++) {
            String candidate = args[i];
            try {
                seedUrls.add(new URL(normalizeUrlEnding(candidate)));
            }
            catch (MalformedURLException e) {
                warn("Skipping malformed input URL: " + candidate);
            }
        }
        // Seed level 0 of the backlog and start crawling from there
        backlog.add(seedUrls);
        return webCrawl(0, out);
    }

    /**
     * Normalizes a URL string by removing the anchor (fragment) part, a
     * dangling "?" and a single trailing slash.
     *
     * @param link the URL string to normalize
     * @return the normalized URL string
     */
    protected static String normalizeUrlEnding(String link) {

        // Strip everything from the first '#' (the fragment)
        int hashPos = link.indexOf('#');
        if (hashPos >= 0) {
            link = link.substring(0, hashPos);
        }
        // Drop a trailing '?' left over after fragment removal
        if (link.endsWith("?")) {
            link = link.substring(0, link.length() - 1);
        }
        // Drop one trailing slash so equivalent URLs compare equal
        if (link.endsWith("/")) {
            link = link.substring(0, link.length() - 1);
        }
        return link;
    }

    /**
     * A very simple crawler, pulling URLs to fetch from a backlog and then
     * recursing N levels deep if recursive&gt;0. Links are parsed from HTML
     * through first getting an XHTML version using SolrCell with extractOnly,
     * and followed if they are local. The crawler pauses for a default delay
     * of 10 seconds between each fetch, configurable via the delay variable.
     * This is only meant for test purposes, as it does not respect robots or
     * anything else fancy :)
     *
     * @param level which level to crawl
     * @param out output stream to write to
     * @return number of pages crawled on this level and below
     */
    protected int webCrawl(int level, OutputStream out) {

        int numPages = 0;
        LinkedHashSet<URL> stack = backlog.get(level);
        int rawStackSize = stack.size();
        stack.removeAll(visited);
        int stackSize = stack.size();

        LinkedHashSet<URL> subStack = new LinkedHashSet<>();
        info("Entering crawl at level " + level + " (" + rawStackSize + " links total, " + stackSize + " new)");
        for (URL u : stack) {
            try {
                visited.add(u);
                PageFetcherResult result = pageFetcher.readPageFromUrl(u);
                if (result.httpStatus == 200) {
                    // Index under the final URL if a redirect was followed
                    u = (result.redirectUrl != null) ? result.redirectUrl : u;
                    URL postUrl = new URL(appendParam(solrUrl.toString(),
                            "literal.id=" + URLEncoder.encode(u.toString(), "UTF-8")
                            + "&literal.url=" + URLEncoder.encode(u.toString(), "UTF-8")));

                    boolean success = postData(new ByteArrayInputStream(result.content), null, out, result.contentType, postUrl);
                    if (success) {
                        info("POSTed web resource " + u + " (depth: " + level + ")");
                        Thread.sleep(delay * 1000);
                        numPages++;
                        // Pull links from HTML pages only
                        if (recursive > level && result.contentType.equals("text/html")) {
                            Set<URL> children = pageFetcher.getLinksFromWebPage(u, new ByteArrayInputStream(result.content), result.contentType, postUrl);
                            subStack.addAll(children);
                        }
                    }
                    else {
                        warn("An error occurred while posting " + u);
                    }
                } else {
                    warn("The URL " + u + " returned a HTTP result status of " + result.httpStatus);
                }
            }
            catch (IOException e) {
                warn("Caught exception when trying to open connection to " + u + ": " + e.getMessage());
            }
            catch (InterruptedException e) {
                // Fix: restore the thread's interrupt status and preserve the
                // cause instead of throwing a bare RuntimeException.
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            }
        }
        if (!subStack.isEmpty()) {
            backlog.add(subStack);
            numPages += webCrawl(level + 1, out);
        }
        return numPages;
    }

    /**
     * Reads an input stream into a byte array and closes the stream.
     *
     * @param is the input stream (closed by this method)
     * @return the byte array
     * @throws IOException If there is a low-level I/O error.
     */
    protected byte[] inputStreamToByteArray(InputStream is) throws IOException {

        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        // Fix: read in chunks instead of one byte per call — the previous
        // single-byte read() loop made one stream call per byte.
        byte[] buffer = new byte[8192];
        int n;
        while ((n = is.read(buffer)) != -1) {
            bos.write(buffer, 0, n);
        }
        bos.flush();
        is.close();
        return bos.toByteArray();
    }

    /**
     * Computes the full URL based on a base url and a possibly relative link
     * found in the href param of an HTML anchor. Links with other schemes
     * (mailto:, javascript:, ...) and common image resources yield null.
     *
     * @param baseUrl the base url from where the link was found
     * @param link the absolute or relative link
     * @return the string version of the full URL, or null when skipped
     */
    protected String computeFullUrl(URL baseUrl, String link) {

        if (link == null || link.length() == 0) {
            return null;
        }
        if (!link.startsWith("http")) {
            if (link.startsWith("/")) {
                // Host-relative: prepend the base URL's scheme and authority
                link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + link;
            }
            else if (link.contains(":")) {
                // Some other scheme (mailto:, javascript:, ...) — not followed
                return null;
            }
            else {
                // Document-relative: resolve against the base path, dropping
                // the trailing file component when one appears to be present
                String path = baseUrl.getPath();
                if (!path.endsWith("/")) {
                    int sep = path.lastIndexOf("/");
                    String file = path.substring(sep + 1);
                    if (file.contains(".") || file.contains("?")) {
                        path = path.substring(0, sep);
                    }
                }
                link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + path + "/" + link;
            }
        }
        link = normalizeUrlEnding(link);
        // Simple brute force skip of common image resources
        String lower = link.toLowerCase(Locale.ROOT);
        if (lower.endsWith(".jpg") || lower.endsWith(".jpeg") || lower.endsWith(".png") || lower.endsWith(".gif")) {
            return null;
        }
        return link;
    }

    /**
     * Uses the mime-type map to reverse lookup whether the file ending for our
     * type is supported by the fileTypes option.
     *
     * @param type what content-type to lookup
     * @return true if this is a supported content type
     */
    protected boolean typeSupported(String type) {

        // Iterate over entries instead of keySet()+get() — avoids one hash
        // lookup per key with identical semantics.
        for (Map.Entry<String, String> entry : mimeMap.entrySet()) {
            if (entry.getValue().equals(type) && fileTypes.contains(entry.getKey())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests if a string is either "true", "on", "yes" or "1".
     *
     * @param property the string to test (null yields false)
     * @return true if the value means "on"
     */
    protected static boolean isOn(String property) {
        // Fix: the previous substring test ("true,on,yes,1".indexOf(property))
        // wrongly accepted fragments such as "n", "e" and the empty string,
        // and threw NullPointerException on null. Compare whole tokens instead.
        return "true".equals(property) || "on".equals(property)
                || "yes".equals(property) || "1".equals(property);
    }

    /** Prints a warning message to stderr, prefixed for easy grepping. */
    static void warn(String msg) {
        System.err.println("SimplePostTool: WARNING: " + msg);
    }

    /** Prints an informational message to stdout. */
    static void info(String msg) {
        System.out.println(msg);
    }

    /** Prints an error message to stderr and terminates the JVM with exit code 2. */
    static void fatal(String msg) {
        System.err.println("SimplePostTool: FATAL: " + msg);
        System.exit(2);
    }

    /**
     * Sends a simple commit request to the configured Solr update URL.
     */
    public void commit() {

        info("COMMITting Solr index changes to " + solrUrl + "..");
        String commitUrl = appendParam(solrUrl.toString(), "commit=true");
        doGet(commitUrl);
    }

    /**
     * Sends a simple optimize request to the configured Solr update URL.
     */
    public void optimize() {

        info("Performing an OPTIMIZE to " + solrUrl + "..");
        String optimizeUrl = appendParam(solrUrl.toString(), "optimize=true");
        doGet(optimizeUrl);
    }

    /**
     * Appends one or more URL query parameters to a URL, choosing "?" or "&amp;"
     * depending on whether the URL already carries a query string. Params not
     * on the form key=value are skipped with a warning.
     *
     * @param url the original URL
     * @param param the parameter(s) to append, separated by "&amp;"
     * @return the string version of the resulting URL
     */
    public static String appendParam(String url, String param) {

        for (String p : param.split("&")) {
            if (p.trim().length() == 0) {
                continue;
            }
            String[] kv = p.split("=");
            if (kv.length != 2) {
                warn("Skipping param " + p + " which is not on form key=value");
                continue;
            }
            String separator = url.indexOf('?') > 0 ? "&" : "?";
            url = url + separator + kv[0] + "=" + kv[1];
        }
        return url;
    }

    /**
     * Opens the file and posts its contents to the solrUrl, writing the
     * response to output. In auto mode the content-type is guessed from the
     * file suffix; rich documents are routed to the /extract (SolrCell)
     * handler with resource.name and literal.id set to the file's path.
     * Unknown suffixes in auto mode are skipped with a warning.
     */
    public void postFile(File file, OutputStream output, String type) {

        InputStream is = null;
        try {
            URL url = solrUrl;
            if (auto) {
                if (type == null) {
                    type = guessType(file);
                }
                if (type != null) {
                    if (type.equals("text/xml") || type.equals("text/csv") || type.equals("application/json")) {
                        // Default handler
                    } else {
                        // SolrCell
                        String urlStr = appendUrlPath(solrUrl, "/extract").toString();
                        // Only add resource.name/literal.id if the user did not
                        // already supply them via -Dparams
                        if (urlStr.indexOf("resource.name") == -1) {
                            urlStr = appendParam(urlStr, "resource.name=" + URLEncoder.encode(file.getAbsolutePath(), "UTF-8"));
                        }
                        if (urlStr.indexOf("literal.id") == -1) {
                            urlStr = appendParam(urlStr, "literal.id=" + URLEncoder.encode(file.getAbsolutePath(), "UTF-8"));
                        }
                        url = new URL(urlStr);
                    }
                }
                else {
                    warn("Skipping " + file.getName() + ". Unsupported file type for auto mode.");
                    return;
                }
            }
            else {
                if (type == null) {
                    type = DEFAULT_CONTENT_TYPE;
                }
            }
            info("POSTing file " + file.getName() + (auto ? " (" + type + ")" : ""));
            is = new FileInputStream(file);
            postData(is, (int) file.length(), output, type, url);
        }
        catch (IOException e) {
            warn("Can't open/read file: " + file);
        }
        finally {
            try {
                if (is != null) {
                    is.close();
                }
            }
            catch (IOException e) {
                fatal("IOException while closing file: " + e);
            }
        }
    }

    /**
     * Appends to the path of the URL, keeping any query string after the
     * appended segment.
     *
     * @param url the URL
     * @param append the path to append
     * @return the final URL version
     */
    protected static URL appendUrlPath(URL url, String append) throws MalformedURLException {

        StringBuilder sb = new StringBuilder();
        sb.append(url.getProtocol()).append("://").append(url.getAuthority());
        sb.append(url.getPath()).append(append);
        String query = url.getQuery();
        if (query != null) {
            sb.append('?').append(query);
        }
        return new URL(sb.toString());
    }

    /**
     * Guesses the type of a file, based on file name suffix
     *
     * @param file the file
     * @return the content-type guessed, or null if the suffix is unknown
     */
    protected static String guessType(File file) {

        String fileName = file.getName();
        // If there is no dot, lastIndexOf returns -1 and the whole
        // file name is treated as the suffix (same as the original).
        int dot = fileName.lastIndexOf('.');
        String suffix = fileName.substring(dot + 1);
        return mimeMap.get(suffix.toLowerCase(Locale.ROOT));
    }

    /**
     * Performs a simple get on the given URL
     */
    public static void doGet(String url) {

        // Validate the URL first; only a well-formed URL is fetched.
        URL parsed;
        try {
            parsed = new URL(url);
        }
        catch (MalformedURLException e) {
            warn("The specified URL " + url + " is not a valid URL. Please check");
            return;
        }
        doGet(parsed);
    }

    /**
     * Performs a simple get on the given URL
     */
    public static void doGet(URL url) {

        // In mock mode no network traffic is generated
        if (mockMode) { return; }

        try {
            HttpURLConnection urlc = (HttpURLConnection) url.openConnection();
            if (HttpURLConnection.HTTP_OK != urlc.getResponseCode()) {
                warn("Solr returned an error #" + urlc.getResponseCode() + " " + urlc.getResponseMessage() + " for url " + url);
            }
        }
        catch (IOException e) {
            // Fixed message: this is a GET, not a POST, and "occured" was misspelled
            warn("An error occurred getting data from " + url + ". Please check that Solr is running.");
        }
    }

    /**
     * Reads data from the data stream and posts it to solr, writes to the
     * response to output
     *
     * @param data stream to read the request body from
     * @param length content length if known, or null to stream without a fixed length
     * @param output where the Solr response body is written; may be null to discard it
     * @param type content-type of the request; defaults to DEFAULT_CONTENT_TYPE when null
     * @param url the Solr URL to POST to
     * @return true if success
     */
    public boolean postData(InputStream data, Integer length, OutputStream output, String type, URL url) {

        if (mockMode) {
            return true;
        }
        boolean success = true;
        if (type == null) {
            type = DEFAULT_CONTENT_TYPE;
        }
        HttpURLConnection urlc = null;
        try {
            try {
                urlc = (HttpURLConnection) url.openConnection();
                try {
                    urlc.setRequestMethod("POST");
                }
                catch (ProtocolException e) {
                    fatal("Shouldn't happen: HttpURLConnection doesn't support POST??" + e);
                }
                urlc.setDoOutput(true);
                urlc.setDoInput(true);
                urlc.setUseCaches(false);
                urlc.setAllowUserInteraction(false);
                urlc.setRequestProperty("Content-type", type);

                // A known length lets the connection stream the body
                // without buffering it all in memory.
                if (null != length) {
                    urlc.setFixedLengthStreamingMode(length);
                }

            }
            catch (IOException e) {
                fatal("Connection error (is Solr running at " + solrUrl + " ?): " + e);
                // Bug fix: the original fell through with urlc possibly null and
                // hit a NullPointerException on getOutputStream(). Bail out here
                // in case fatal() did not terminate the process.
                return false;
            }

            try(OutputStream lout = urlc.getOutputStream()) {
                pipe(data, lout);
            }
            catch (IOException e) {
                fatal("IOException while posting data: " + e);
                success = false;
            }

            try(InputStream in = urlc.getInputStream()) {
                if (HttpURLConnection.HTTP_OK != urlc.getResponseCode()) {
                    warn("Solr returned an error #" + urlc.getResponseCode() + " " + urlc.getResponseMessage());
                    success = false;
                }

                pipe(in, output);
            }
            catch (IOException e) {
                warn("IOException while reading response: " + e);
                success = false;
            }

        }
        finally {
            // Always release the connection, even on early return
            if (urlc != null) {
                urlc.disconnect();
            }
        }
        return success;
    }

    /**
     * Converts a string to an input stream
     *
     * @param s the string
     * @return an input stream over the UTF-8 bytes of s
     */
    public static InputStream stringToStream(String s) {

        // Using the Charset overload instead of getBytes("UTF-8") removes the
        // checked UnsupportedEncodingException (UTF-8 is always available),
        // so the impossible fatal() branch of the original is no longer needed.
        return new ByteArrayInputStream(s.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    }

    /**
     * Pipes everything from the source to the dest. If dest is null, then
     * everything is read from source and thrown away.
     */
    private static void pipe(InputStream source, OutputStream dest) throws IOException {

        final byte[] buffer = new byte[1024];
        // Read until end-of-stream; a null dest simply drains the source.
        for (int count = source.read(buffer); count >= 0; count = source.read(buffer)) {
            if (dest != null) {
                dest.write(buffer, 0, count);
            }
        }
        if (dest != null) {
            dest.flush();
        }
    }

    // Builds a file filter from a comma-separated list of file suffixes;
    // "*" matches every file.
    public GlobFileFilter getFileFilterFromFileTypes(String fileTypes) {

        final String regex = fileTypes.equals("*")
                ? ".*"
                : "^.*\\.(" + fileTypes.replace(",", "|") + ")$";
        return new GlobFileFilter(regex, true);
    }

    //
    // Utility methods for XPath handing
    //
    // Gets all nodes matching an XPath
    public static NodeList getNodesFromXP(Node n, String xpath) throws XPathExpressionException {

        // Compile the expression and evaluate it as a node set in one pass
        XPath xp = XPathFactory.newInstance().newXPath();
        return (NodeList) xp.compile(xpath).evaluate(n, XPathConstants.NODESET);
    }

    /**
     * Gets the string content of the matching an XPath
     *
     * @param n the node (or doc)
     * @param xpath the xpath string
     * @param concatAll if true, text from all matching nodes will be
     * concatenated, else only the first returned
     */
    public static String getXP(Node n, String xpath, boolean concatAll) throws XPathExpressionException {

        // Evaluate the expression inline as a node set
        XPathExpression expr = XPathFactory.newInstance().newXPath().compile(xpath);
        NodeList nodes = (NodeList) expr.evaluate(n, XPathConstants.NODESET);
        if (nodes.getLength() == 0) {
            return "";
        }
        // Join the node values with single spaces (first node only unless concatAll)
        StringBuilder text = new StringBuilder();
        int limit = concatAll ? nodes.getLength() : 1;
        for (int i = 0; i < limit; i++) {
            text.append(nodes.item(i).getNodeValue()).append(" ");
        }
        return text.toString().trim();
    }

    /**
     * Takes a string as input and returns a DOM
     *
     * @param in the XML text to parse
     * @param inputEncoding the charset used to turn the string into bytes before parsing
     * @return the parsed DOM document
     * @throws SAXException if the input is not well-formed or contains a DOCTYPE
     * @throws IOException if the encoding is unsupported
     * @throws ParserConfigurationException if the parser cannot be configured
     */
    public static Document makeDom(String in, String inputEncoding) throws SAXException, IOException, ParserConfigurationException {

        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        // Security fix: this method parses content derived from arbitrary crawled
        // web pages (see getLinksFromWebPage), so disable DTDs and entity
        // expansion to prevent XXE / entity-expansion attacks.
        dbf.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        dbf.setExpandEntityReferences(false);
        dbf.setXIncludeAware(false);
        InputStream is = new ByteArrayInputStream(in.getBytes(inputEncoding));
        return dbf.newDocumentBuilder().parse(is);
    }

    /**
     * Inner class to filter files based on glob wildcards
     */
    class GlobFileFilter implements FileFilter {

        private String _pattern;
        private Pattern p;

        public GlobFileFilter(String pattern, boolean isRegex) {
            _pattern = pattern;
            if (!isRegex) {
                _pattern = _pattern
                        .replace("^", "\\^")
                        .replace("$", "\\$")
                        .replace(".", "\\.")
                        .replace("(", "\\(")
                        .replace(")", "\\)")
                        .replace("+", "\\+")
                        .replace("*", ".*")
                        .replace("?", ".");
                _pattern = "^" + _pattern + "$";
            }

            try {
                p = Pattern.compile(_pattern, Pattern.CASE_INSENSITIVE);
            }
            catch (PatternSyntaxException e) {
                fatal("Invalid type list " + pattern + ". " + e.getDescription());
            }
        }

        @Override
        public boolean accept(File file) {
            return p.matcher(file.getName()).find();
        }
    }

    //
    // Simple crawler class which can fetch a page and check for robots.txt
    //
    class PageFetcher {

        // Per-host cache of robots.txt disallow lists, so each host's
        // robots.txt is fetched at most once
        Map<String, List<String>> robotsCache;
        final String DISALLOW = "Disallow:";

        public PageFetcher() {
            robotsCache = new HashMap<>();
        }

        /**
         * Fetches a page, honoring robots.txt, tracking redirects and
         * transparently decompressing gzip/deflate response bodies.
         *
         * @param u the URL to fetch
         * @return the fetch result; content is non-null only on a 200 response
         * with a supported content type
         */
        public PageFetcherResult readPageFromUrl(URL u) {

            PageFetcherResult res = new PageFetcherResult();
            try {
                if (isDisallowedByRobots(u)) {
                    warn("The URL " + u + " is disallowed by robots.txt and will not be crawled.");
                    res.httpStatus = 403;
                    visited.add(u);
                    return res;
                }
                res.httpStatus = 404;
                HttpURLConnection conn = (HttpURLConnection) u.openConnection();
                conn.setRequestProperty("User-Agent", "SimplePostTool-crawler/" + VERSION_OF_THIS_TOOL + " (http://lucene.apache.org/solr/)");
                conn.setRequestProperty("Accept-Encoding", "gzip, deflate");
                conn.connect();
                res.httpStatus = conn.getResponseCode();
                // A differing final URL means we were redirected
                if (!normalizeUrlEnding(conn.getURL().toString()).equals(normalizeUrlEnding(u.toString()))) {
                    info("The URL " + u + " caused a redirect to " + conn.getURL());
                    u = conn.getURL();
                    res.redirectUrl = u;
                    visited.add(u);
                }
                if (res.httpStatus == 200) {
                    // Raw content type of form "text/html; encoding=utf-8"
                    String rawContentType = conn.getContentType();
                    // Bug fix: guard against servers that send no Content-Type
                    // header at all (the original NPE'd on split())
                    String type = rawContentType == null ? null : rawContentType.split(";")[0];
                    if (type != null && typeSupported(type)) {
                        String encoding = conn.getContentEncoding();
                        InputStream is;
                        if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
                            is = new GZIPInputStream(conn.getInputStream());
                        }
                        else if (encoding != null && encoding.equalsIgnoreCase("deflate")) {
                            // Inflater(true) expects raw deflate data (no zlib header)
                            is = new InflaterInputStream(conn.getInputStream(), new Inflater(true));
                        }
                        else {
                            is = conn.getInputStream();
                        }

                        // Read into memory, so that we later can pull links from the page without re-fetching
                        res.content = inputStreamToByteArray(is);
                        is.close();
                    }
                    else {
                        warn("Skipping URL with unsupported type " + type);
                        res.httpStatus = 415;
                    }
                }
            }
            catch (IOException e) {
                warn("IOException when reading page from url " + u + ": " + e.getMessage());
            }

            return res;
        }

        /**
         * Checks whether the URL is disallowed by its host's robots.txt,
         * fetching and caching the disallow list per host.
         */
        public boolean isDisallowedByRobots(URL url) {

            String host = url.getHost();
            String strRobot = url.getProtocol() + "://" + host + "/robots.txt";
            List<String> disallows = robotsCache.get(host);
            if (disallows == null) {
                disallows = new ArrayList<>();

                URL urlRobot;
                try {
                    urlRobot = new URL(strRobot);
                    disallows = parseRobotsTxt(urlRobot.openStream());
                }
                catch (MalformedURLException e) {
                    return true; // We cannot trust this robots URL, should not happen
                }
                catch (IOException e) {
                    // There is no robots.txt, will cache an empty disallow list
                }
                // Fix: cache only on a miss; the original re-put the same list
                // on every single call
                robotsCache.put(host, disallows);
            }

            String strURL = url.getFile();
            for (String path : disallows) {
                if (path.equals("/") || strURL.indexOf(path) == 0) {
                    return true;
                }
            }
            return false;
        }

        /**
         * Very simple robots.txt parser which obeys all Disallow lines
         * regardless of user agent or whether there are valid Allow: lines.
         *
         * @param is Input stream of the robots.txt file
         * @return a list of disallow paths
         * @throws IOException if problems reading the stream
         */
        protected List<String> parseRobotsTxt(InputStream is) throws IOException {

            List<String> disallows = new ArrayList<>();
            // try-with-resources: the original leaked the stream if readLine threw
            try (BufferedReader r = new BufferedReader(new InputStreamReader(is, "UTF-8"))) {
                String l;
                while ((l = r.readLine()) != null) {
                    // Strip trailing "# comment"; a line of only '#'s splits to
                    // an empty array, hence the length check
                    String[] arr = l.split("#");
                    if (arr.length == 0) {
                        continue;
                    }
                    l = arr[0].trim();
                    if (l.startsWith(DISALLOW)) {
                        l = l.substring(DISALLOW.length()).trim();
                        if (l.length() == 0) {
                            continue;
                        }
                        disallows.add(l);
                    }
                }
            }
            return disallows;
        }

        /**
         * Finds links on a web page, using /extract?extractOnly=true
         *
         * @param u the URL of the web page
         * @param is the input stream of the page
         * @param type the content-type
         * @param postUrl the URL (typically /solr/extract) in order to pull out
         * links
         * @return a set of URLs parsed from the page
         */
        protected Set<URL> getLinksFromWebPage(URL u, InputStream is, String type, URL postUrl) {

            Set<URL> l = new HashSet<>();
            URL url = null;
            try {
                ByteArrayOutputStream os = new ByteArrayOutputStream();
                URL extractUrl = new URL(appendParam(postUrl.toString(), "extractOnly=true"));
                boolean success = postData(is, null, os, type, extractUrl);
                if (success) {
                    // The extract handler wraps the page's HTML in the first
                    // <str> element of its XML response
                    String rawXml = os.toString("UTF-8");
                    Document d = makeDom(rawXml, "UTF-8");
                    String innerXml = getXP(d, "/response/str/text()[1]", false);
                    d = makeDom(innerXml, "UTF-8");
                    NodeList links = getNodesFromXP(d, "/html/body//a/@href");
                    for (int i = 0; i < links.getLength(); i++) {
                        String link = links.item(i).getTextContent();
                        link = computeFullUrl(u, link);
                        if (link == null) {
                            continue;
                        }
                        url = new URL(link);
                        // Stay on the same host:port — skip off-site links
                        if (url.getAuthority() == null || !url.getAuthority().equals(u.getAuthority())) {
                            continue;
                        }
                        l.add(url);
                    }
                }
            }
            catch (MalformedURLException e) {
                warn("Malformed URL " + url);
            }
            catch (IOException e) {
                warn("IOException opening URL " + url + ": " + e.getMessage());
            }
            catch (Exception e) {
                // Fix: preserve the cause — the original threw a bare
                // RuntimeException, discarding all diagnostic information
                throw new RuntimeException(e);
            }
            return l;
        }
    }

    /**
     * Utility class to hold the result from a page fetch
     */
    public class PageFetcherResult {

        // HTTP status of the fetch; starts at 200 and is overwritten by PageFetcher
        int httpStatus = 200;
        // Content type of the fetched page
        // NOTE(review): never reassigned by readPageFromUrl in this file — confirm intended
        String contentType = "text/html";
        // Non-null only when the request was redirected to another URL
        URL redirectUrl = null;
        // Raw page bytes; set only on a 200 response with a supported type
        byte[] content;
    }
}
